library(class)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
library(magrittr)
library(XML)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
library(stringi)
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
library(ggplot2)
library(RCurl)
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(httr)
##
## Attaching package: 'httr'
## The following object is masked from 'package:plotly':
##
## config
## The following object is masked from 'package:caret':
##
## progress
library(jsonlite)
library(e1071)
library(naniar)
library(scales)
library(gghighlight)
library(stringr)
library(knitr)
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:RCurl':
##
## complete
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
##
## subplot
## The following object is masked from 'package:rvest':
##
## html
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following object is masked from 'package:e1071':
##
## impute
## The following objects are masked from 'package:base':
##
## format.pval, units
library(ggpubr)
knitr::opts_chunk$set(error = TRUE)
# Load the raw beer and brewery tables.
# BUG FIX: the original paths used backslash-escaped spaces ('\ '), which is
# an unrecognized escape sequence inside a quoted R string and fails to
# parse; spaces need no escaping inside quotes.
data_dir <- '/Users/renfengwang/Documents/SMU Data Science Program/Doing Data Science/Project 1'
beers <- read.csv(file.path(data_dir, 'Beers.csv'), header = TRUE)
brewery <- read.csv(file.path(data_dir, 'Breweries.csv'), header = TRUE)
# Quick structural preview of both raw tables before merging: beers carry
# ABV/IBU/Style keyed by Brewery_id; breweries carry name/city/state keyed
# by Brew_ID.
head(beers)
## Name Beer_ID ABV IBU Brewery_id
## 1 Pub Beer 1436 0.050 NA 409
## 2 Devil's Cup 2265 0.066 NA 178
## 3 Rise of the Phoenix 2264 0.071 NA 178
## 4 Sinister 2263 0.090 NA 178
## 5 Sex and Candy 2262 0.075 NA 178
## 6 Black Exodus 2261 0.077 NA 178
## Style Ounces
## 1 American Pale Lager 12
## 2 American Pale Ale (APA) 12
## 3 American IPA 12
## 4 American Double / Imperial IPA 12
## 5 American IPA 12
## 6 Oatmeal Stout 12
head(brewery)
## Brew_ID Name City State
## 1 1 NorthGate Brewing Minneapolis MN
## 2 2 Against the Grain Brewery Louisville KY
## 3 3 Jack's Abby Craft Lagers Framingham MA
## 4 4 Mike Hess Brewing Company San Diego CA
## 5 5 Fort Point Beer Company San Francisco CA
## 6 6 COAST Brewing Company Charleston SC
#Question 1
# Bar plot of the number of breweries in each state, labelled with counts.
# The stray `count = Name` argument in the original ggplot() call was never
# a valid aesthetic or parameter (geom_bar() counts rows itself) and has
# been removed.
brewery %>% arrange(State) %>%
  ggplot(aes(x = State)) + geom_bar() +
  geom_text(aes(label = ..count..), stat = 'count', vjust = -.5) +
  xlab('States') + ylab('Brewery Numbers') + ggtitle('Numbers of Brewery by State')
#Question 2
# Harmonize the brewery key column name across the two tables, then full
# (outer) join so unmatched rows from either side are retained.
colnames(beers)[colnames(beers) == 'Brewery_id'] <- 'Brew_ID'
df_beer <- merge(beers, brewery, by = 'Brew_ID', all = TRUE)
# merge() suffixed the duplicated 'Name' column; give both descriptive names.
colnames(df_beer)[colnames(df_beer) == 'Name.x'] <- 'Beer_Name'
colnames(df_beer)[colnames(df_beer) == 'Name.y'] <- 'Brewery_Name'
head(df_beer)
## Brew_ID Beer_Name Beer_ID ABV IBU Style
## 1 1 Get Together 2692 0.045 50 American IPA
## 2 1 Maggie's Leap 2691 0.049 26 Milk / Sweet Stout
## 3 1 Wall's End 2690 0.048 19 English Brown Ale
## 4 1 Pumpion 2689 0.060 38 Pumpkin Ale
## 5 1 Stronghold 2688 0.060 25 American Porter
## 6 1 Parapet ESB 2687 0.056 47 Extra Special / Strong Bitter (ESB)
## Ounces Brewery_Name City State
## 1 16 NorthGate Brewing Minneapolis MN
## 2 16 NorthGate Brewing Minneapolis MN
## 3 16 NorthGate Brewing Minneapolis MN
## 4 16 NorthGate Brewing Minneapolis MN
## 5 16 NorthGate Brewing Minneapolis MN
## 6 16 NorthGate Brewing Minneapolis MN
# Also check the tail of the merged table (merge() sorts by Brew_ID by
# default, so these are the highest brewery IDs).
tail(df_beer)
## Brew_ID Beer_Name Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Brewery_Name City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
#Question 3
# Treat empty and single-space strings as missing so they are counted as NA.
df_beerall <- df_beer %>% replace_with_na_all(condition = ~.x == '')
df_beerall <- df_beerall %>% replace_with_na_all(condition = ~.x == ' ')
gg_miss_var(df_beerall)
table(is.na(df_beerall$IBU))
##
## FALSE TRUE
## 1405 1005
table(is.na(df_beerall$ABV))
##
## FALSE TRUE
## 2348 62
# BUG FIX: the original expression table(is.na(ABV | IBU)) tested the
# missingness of the logical OR of the two (numeric) columns; under
# three-valued logic NA | TRUE is TRUE, so rows with a known ABV were never
# counted as missing even when IBU was NA. To count rows missing either
# value, OR the two is.na() masks instead.
table(is.na(df_beerall$ABV) | is.na(df_beerall$IBU))
## (counts below were produced by the original, incorrect expression)
##
## FALSE TRUE
## 2348 62
# Impute ABV and IBU with linear-regression predictions (5 imputations).
imp <- mice(df_beerall, method = 'norm.predict', m = 5)
##
## iter imp variable
## 1 1 ABV IBU
## 1 2 ABV IBU
## 1 3 ABV IBU
## 1 4 ABV IBU
## 1 5 ABV IBU
## 2 1 ABV IBU
## 2 2 ABV IBU
## 2 3 ABV IBU
## 2 4 ABV IBU
## 2 5 ABV IBU
## 3 1 ABV IBU
## 3 2 ABV IBU
## 3 3 ABV IBU
## 3 4 ABV IBU
## 3 5 ABV IBU
## 4 1 ABV IBU
## 4 2 ABV IBU
## 4 3 ABV IBU
## 4 4 ABV IBU
## 4 5 ABV IBU
## 5 1 ABV IBU
## 5 2 ABV IBU
## 5 3 ABV IBU
## 5 4 ABV IBU
## 5 5 ABV IBU
## Warning: Number of logged events: 5
# Extract a single completed data set. Qualify with mice:: because three
# attached packages export `complete` (tidyr, RCurl, mice per the masking
# messages above) and the bare name depends on library() order.
beer_imp <- mice::complete(imp)
gg_miss_var(beer_imp)
table(is.na(beer_imp$IBU))
##
## FALSE
## 2410
table(is.na(beer_imp$ABV))
##
## FALSE
## 2410
# Style was not imputed; five missing styles remain.
table(is.na(beer_imp$Style))
##
## FALSE TRUE
## 2405 5
We can see there are missing values in the Style, ABV and IBU columns: 62 in ABV, 1005 in IBU, and 5 in Style. I replaced the missing values in the ABV and IBU columns using a linear-regression imputation. I will fill in the missing values in the Beer Style column by looking up their styles online, as there are only 5 of them.
#Question 4
# Median ABV and IBU per state (the data are imputed, so na.rm is only a
# safeguard). .groups = 'drop' returns an ungrouped tibble and silences the
# summarise() regrouping message; group_by() already orders the result by
# State, so the original arrange() step was redundant.
Median_ABV_IBU <- beer_imp %>%
  group_by(State) %>%
  summarise(Median_ABV = median(ABV, na.rm = TRUE),
            Median_IBU = median(IBU, na.rm = TRUE),
            .groups = 'drop')
Median_ABV_IBU %>% ggplot(aes(x = State, y = Median_ABV, width = .5)) + geom_bar(stat = 'identity') +
  geom_text(aes(label = percent(Median_ABV, accuracy = 0.01)), vjust = -.5, size = 2.5, check_overlap = TRUE) +
  xlab('States') + ylab('Median Alcoholic Content') + ggtitle('Median Alcoholic Content by State')
Median_ABV_IBU %>% ggplot(aes(x = State, y = Median_IBU, width = .5)) + geom_bar(stat = 'identity') +
  geom_text(aes(label = sprintf("%0.1f", round(Median_IBU, digits = 1))), vjust = -.5, size = 2.5, check_overlap = TRUE) +
  xlab('States') + ylab('Median International Bitterness') + ggtitle('Median International Bitterness by State')
#Question 5
# Maximum ABV and IBU per state. Same cleanups as Question 4:
# .groups = 'drop' silences the regrouping message and group_by() already
# orders the output by State.
Max_ABV_IBU <- beer_imp %>%
  group_by(State) %>%
  summarise(Max_ABV = max(ABV, na.rm = TRUE),
            Max_IBU = max(IBU, na.rm = TRUE),
            .groups = 'drop')
Max_ABV_IBU %>% ggplot(aes(x = State, y = Max_ABV, width = .5)) + geom_bar(stat = 'identity') +
  geom_text(aes(label = percent(Max_ABV, accuracy = 0.01)), vjust = -.5, size = 2.5, check_overlap = TRUE) +
  xlab('States') + ylab('Max Alcoholic Content') + ggtitle('Max Alcoholic Content by State')
Max_ABV_IBU %>% ggplot(aes(x = State, y = Max_IBU, width = .5)) + geom_bar(stat = 'identity') +
  geom_text(aes(label = sprintf("%0.1f", round(Max_IBU, digits = 1))), vjust = -.5, size = 2.5, check_overlap = TRUE) +
  xlab('States') + ylab('Max International Bitterness') + ggtitle('Max International Bitterness by State')
We can see Colorado has the most alcoholic beer, with an ABV of 12.8%, and Oregon has the most bitter beer, with an IBU of 138.
#Question 6
# Summary statistics of ABV across all beers (post-imputation).
beer_imp %>% summarise(Mean = mean(ABV, na.rm = TRUE),
                       Median = median(ABV, na.rm = TRUE),
                       Min = min(ABV, na.rm = TRUE),
                       Max = max(ABV, na.rm = TRUE),
                       SD = sd(ABV, na.rm = TRUE),
                       N = n())
## Mean Median Min Max SD N
## 1 0.05976504 0.057 0.001 0.128 0.01337556 2410
# Histogram on the density scale overlaid with a kernel density estimate.
beer_imp %>% filter(!is.na(ABV)) %>% ggplot(aes(x = ABV)) +
  geom_histogram(aes(y = ..density..), colour = 'black', fill = 'white') +
  geom_density(alpha = .5, fill = '#FF6666')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distribution of ABV is right skewed. ABV values around 5% have the most counts. There are 2410 non-missing ABV values in this data set. The maximum ABV is 12.8%, the minimum ABV is 0.1%, and the median ABV is 5.7%. The mean ABV is 5.98% and the standard deviation of ABV is 1.34%.
#Question 7
# Relationship between bitterness (IBU) and alcohol content (ABV): a
# jittered scatter with a loess trend, followed by the Pearson correlation
# between the two measures.
beer_imp %>%
  filter(!is.na(ABV), !is.na(IBU)) %>%
  ggplot(aes(x = IBU, y = ABV)) +
  geom_point(position = 'jitter') +
  geom_smooth(method = loess)
## `geom_smooth()` using formula 'y ~ x'
# Pearson r (with its p-value) between ABV and IBU.
rcorr(beer_imp$ABV, beer_imp$IBU, type = 'pearson')
## x y
## x 1.00 0.76
## y 0.76 1.00
##
## n= 2410
##
##
## P
## x y
## x 0
## y 0
# Same scatter with a regression line, confidence band and the correlation
# coefficient annotated on the plot.
ggscatter(beer_imp, x = 'IBU', y = 'ABV', add = 'reg.line',
          conf.int = TRUE, cor.coef = TRUE, cor.method = 'pearson')
## `geom_smooth()` using formula 'y ~ x'
Most beers with lower IBU (less than 50) have ABV values around 5%. As IBU increases, the ABV values spread out, though for most beers with IBU above 50 the ABV stays between 5% and 10%. Some points fall exactly on a line because they are missing values that were filled in by linear regression; as a result, the correlation coefficient of 0.76 is probably larger than it would be if the missing values were removed instead.
#Question 8 KNN
# Inspect the five beers whose Style is still missing after imputation.
beer_imp %>% filter(is.na(Style))
## Brew_ID Beer_Name Beer_ID ABV IBU Style
## 1 30 Special Release 2210 0.06429940 47.97155 <NA>
## 2 67 OktoberFiesta 2527 0.05300000 27.00000 <NA>
## 3 161 Kilt Lifter Scottish-Style Ale 1635 0.06000000 21.00000 <NA>
## 4 167 The CROWLER™ 1796 0.07677540 61.09509 <NA>
## 5 167 CAN'D AID Foundation 1790 0.05815814 40.83925 <NA>
## Ounces Brewery_Name City State
## 1 16 Cedar Creek Brewery Seven Points TX
## 2 12 Freetail Brewing Company San Antonio TX
## 3 12 Four Peaks Brewing Company Tempe AZ
## 4 32 Oskar Blues Brewery Longmont CO
## 5 12 Oskar Blues Brewery Longmont CO
# Fill in the five styles (looked up manually online), keyed on Beer_ID.
beer_imp$Style[beer_imp$Beer_ID == '2210'] <- 'Red Ale - American Amber/Red'
beer_imp$Style[beer_imp$Beer_ID == '2527'] <- 'Lager - Märzen/Oktoberfest'
beer_imp$Style[beer_imp$Beer_ID == '1635'] <- 'Scottish Ale'
beer_imp$Style[beer_imp$Beer_ID == '1796'] <- 'IPA - AMERICAN'
beer_imp$Style[beer_imp$Beer_ID == '1790'] <- 'FRUITED ALE'
# IPAs: styles containing the whole word "IPA" (case-insensitive), with both
# ABV and IBU present. The pattern is written literally instead of being
# assembled with str_c(..., sep = '').
df_beer_IPA <- beer_imp %>%
  filter(!is.na(ABV), !is.na(IBU)) %>%
  filter(str_detect(Style, regex('\\bIPA\\b', ignore_case = TRUE)))
df_beer_IPA$Style <- as.factor('IPA')
# All other Ales. NOTE(review): '\\bAle\\b' also matches styles such as
# "India Pale Ale", so some beers can land in both subsets under different
# labels -- confirm this overlap is intended.
df_beer_Ale <- beer_imp %>%
  filter(!is.na(ABV), !is.na(IBU)) %>%
  filter(str_detect(Style, regex('\\bAle\\b', ignore_case = TRUE)))
df_beer_Ale$Style <- as.factor('Ale')
# Two-class data set, then one seeded 70/30 split classified with k = 5.
# Feature columns are selected by name rather than positional index 4:5.
df_beer_test <- rbind(df_beer_IPA, df_beer_Ale)
iterations <- 500
numks <- 30
splitPerc <- .7
masterAcc <- matrix(nrow = iterations, ncol = numks)
set.seed(6)
trainIndices <- sample(seq_len(nrow(df_beer_test)), round(splitPerc * nrow(df_beer_test)))
beer_train <- df_beer_test[trainIndices, ]
beer_test <- df_beer_test[-trainIndices, ]
classifications <- knn(beer_train[, c('ABV', 'IBU')], beer_test[, c('ABV', 'IBU')],
                       beer_train$Style, prob = TRUE, k = 5)
CM <- confusionMatrix(table(classifications, beer_test$Style))
classifications
## [1] IPA IPA IPA Ale Ale Ale IPA IPA IPA IPA Ale IPA IPA IPA IPA Ale IPA IPA
## [19] Ale IPA Ale Ale IPA IPA Ale Ale Ale IPA IPA IPA IPA Ale IPA IPA Ale IPA
## [37] IPA IPA Ale IPA IPA IPA IPA IPA IPA IPA IPA IPA Ale IPA IPA IPA Ale IPA
## [55] IPA IPA IPA IPA IPA IPA IPA IPA Ale IPA IPA IPA IPA IPA Ale IPA Ale IPA
## [73] IPA IPA Ale IPA IPA Ale IPA Ale Ale IPA IPA IPA Ale IPA IPA IPA IPA IPA
## [91] Ale Ale IPA Ale Ale IPA Ale Ale Ale IPA IPA IPA Ale IPA Ale IPA Ale Ale
## [109] IPA IPA Ale IPA IPA Ale Ale IPA IPA IPA IPA IPA IPA IPA IPA IPA Ale Ale
## [127] Ale IPA IPA Ale Ale IPA IPA Ale IPA IPA IPA Ale Ale IPA Ale IPA Ale Ale
## [145] Ale Ale Ale IPA IPA Ale Ale IPA IPA IPA IPA Ale Ale IPA Ale IPA Ale Ale
## [163] Ale Ale Ale IPA IPA Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [181] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale
## [199] Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [217] Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale IPA
## [235] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [253] Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale IPA Ale Ale
## [271] Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale IPA IPA Ale Ale
## [289] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale IPA IPA
## [307] IPA Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale IPA IPA Ale Ale Ale Ale
## [325] Ale IPA Ale Ale IPA IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [343] Ale IPA Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale
## [361] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale
## [379] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [397] IPA IPA Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale
## [415] IPA Ale Ale IPA IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA
## [433] Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale
## [451] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale
## attr(,"prob")
## [1] 0.6000000 1.0000000 0.6000000 0.6000000 0.6000000 0.6000000 1.0000000
## [8] 0.8888889 1.0000000 0.8000000 0.8235294 0.6000000 1.0000000 1.0000000
## [15] 1.0000000 0.6000000 0.6000000 1.0000000 0.8000000 0.8000000 0.6666667
## [22] 1.0000000 1.0000000 0.8888889 0.6000000 0.8000000 0.8000000 0.6000000
## [29] 0.8000000 0.8333333 0.8000000 0.6000000 0.8571429 0.8571429 0.8000000
## [36] 1.0000000 1.0000000 0.8000000 0.6000000 0.6000000 0.8000000 0.8571429
## [43] 0.9000000 0.6000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
## [50] 0.8888889 1.0000000 0.8000000 0.8333333 1.0000000 0.8571429 1.0000000
## [57] 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000 0.6000000
## [64] 1.0000000 0.8333333 0.6000000 0.6666667 1.0000000 0.8000000 0.6000000
## [71] 0.8000000 0.8000000 0.6000000 0.6000000 1.0000000 1.0000000 0.6000000
## [78] 0.8000000 1.0000000 1.0000000 0.6666667 0.6000000 0.6000000 1.0000000
## [85] 0.6000000 0.6000000 0.8000000 0.8571429 0.8571429 0.8888889 0.8000000
## [92] 0.8000000 1.0000000 0.8000000 0.7142857 1.0000000 0.8000000 0.8000000
## [99] 0.8000000 0.6000000 1.0000000 0.8333333 0.8235294 0.6000000 1.0000000
## [106] 0.8571429 0.8333333 0.8000000 0.8000000 0.6000000 0.6000000 1.0000000
## [113] 0.6000000 0.8333333 0.8235294 0.6000000 1.0000000 0.8571429 0.8571429
## [120] 0.8571429 0.9000000 0.8888889 1.0000000 1.0000000 1.0000000 0.8000000
## [127] 0.6000000 1.0000000 0.8750000 0.6000000 0.8000000 0.8000000 0.9000000
## [134] 0.8000000 1.0000000 1.0000000 0.6000000 0.6666667 0.6000000 0.8000000
## [141] 0.8000000 0.6666667 0.8000000 0.8000000 0.8000000 0.8000000 0.6000000
## [148] 0.8333333 1.0000000 0.6000000 0.8235294 0.6000000 0.8333333 1.0000000
## [155] 1.0000000 0.6000000 1.0000000 0.6000000 0.6666667 0.6000000 0.8000000
## [162] 1.0000000 1.0000000 1.0000000 0.6000000 0.6000000 1.0000000 0.8000000
## [169] 0.6000000 1.0000000 0.8000000 0.8000000 1.0000000 0.7777778 1.0000000
## [176] 1.0000000 0.8000000 0.8000000 0.6000000 0.6000000 1.0000000 0.8333333
## [183] 0.7142857 1.0000000 1.0000000 0.6000000 0.8000000 0.6000000 1.0000000
## [190] 1.0000000 1.0000000 0.6666667 0.8000000 1.0000000 1.0000000 1.0000000
## [197] 0.6000000 0.8000000 1.0000000 0.6000000 0.6000000 1.0000000 1.0000000
## [204] 0.8000000 1.0000000 0.6000000 0.6000000 1.0000000 0.8000000 0.8000000
## [211] 1.0000000 0.6000000 1.0000000 1.0000000 0.8000000 0.6000000 0.6000000
## [218] 1.0000000 0.6250000 0.6666667 0.8000000 1.0000000 1.0000000 1.0000000
## [225] 0.6250000 0.8000000 1.0000000 1.0000000 1.0000000 0.8000000 1.0000000
## [232] 1.0000000 0.6000000 0.7500000 0.8000000 1.0000000 0.6000000 1.0000000
## [239] 1.0000000 1.0000000 0.7500000 0.6250000 0.6250000 1.0000000 0.8000000
## [246] 1.0000000 0.6000000 1.0000000 0.8235294 0.8000000 0.8000000 1.0000000
## [253] 0.6000000 0.6000000 1.0000000 0.8000000 0.8000000 1.0000000 1.0000000
## [260] 1.0000000 1.0000000 1.0000000 1.0000000 0.6000000 1.0000000 1.0000000
## [267] 0.8000000 0.6000000 1.0000000 0.8000000 1.0000000 0.8000000 0.8000000
## [274] 0.8000000 0.6000000 1.0000000 0.6000000 1.0000000 0.8000000 0.6000000
## [281] 1.0000000 1.0000000 1.0000000 0.8235294 0.6000000 1.0000000 1.0000000
## [288] 1.0000000 0.6000000 0.8000000 0.8000000 1.0000000 0.8000000 0.8000000
## [295] 1.0000000 0.8000000 1.0000000 0.6000000 1.0000000 1.0000000 0.6000000
## [302] 0.8000000 0.8000000 1.0000000 0.8333333 0.8000000 0.6000000 1.0000000
## [309] 1.0000000 0.6000000 0.7777778 0.8000000 0.8000000 1.0000000 1.0000000
## [316] 1.0000000 1.0000000 0.8000000 1.0000000 0.6000000 0.6666667 1.0000000
## [323] 1.0000000 0.6000000 1.0000000 0.6000000 1.0000000 0.8000000 1.0000000
## [330] 0.6000000 0.8000000 0.8000000 1.0000000 1.0000000 0.6666667 1.0000000
## [337] 0.6000000 1.0000000 0.8000000 0.7777778 1.0000000 1.0000000 1.0000000
## [344] 0.8333333 1.0000000 0.6000000 1.0000000 1.0000000 1.0000000 0.6000000
## [351] 0.8333333 0.6250000 1.0000000 1.0000000 1.0000000 1.0000000 0.6000000
## [358] 1.0000000 0.6000000 0.6250000 1.0000000 0.8000000 1.0000000 1.0000000
## [365] 1.0000000 0.6000000 0.8000000 1.0000000 1.0000000 0.7777778 1.0000000
## [372] 1.0000000 0.6000000 1.0000000 0.7777778 0.6000000 1.0000000 1.0000000
## [379] 1.0000000 0.6000000 0.6000000 1.0000000 0.6000000 1.0000000 0.8000000
## [386] 1.0000000 0.8000000 0.6000000 1.0000000 0.6250000 1.0000000 1.0000000
## [393] 1.0000000 0.6666667 1.0000000 0.6000000 0.6000000 0.6000000 0.5555556
## [400] 1.0000000 0.8000000 0.8888889 1.0000000 1.0000000 1.0000000 1.0000000
## [407] 0.8000000 1.0000000 0.8000000 0.6666667 0.6000000 0.8000000 0.8000000
## [414] 0.6000000 0.8000000 0.6666667 1.0000000 0.8333333 0.6000000 1.0000000
## [421] 0.6000000 1.0000000 1.0000000 1.0000000 1.0000000 0.6000000 1.0000000
## [428] 1.0000000 0.8000000 0.6666667 1.0000000 0.8000000 0.6666667 1.0000000
## [435] 1.0000000 0.8000000 1.0000000 1.0000000 0.6000000 1.0000000 1.0000000
## [442] 0.6666667 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8235294
## [449] 0.8571429 1.0000000 0.8571429 1.0000000 1.0000000 1.0000000 0.8000000
## [456] 0.8000000 1.0000000 0.6000000 0.8000000 1.0000000 0.8000000 0.8333333
## [463] 1.0000000 1.0000000 0.8000000
## Levels: IPA Ale
# Print the confusion matrix and accuracy for the single k = 5 split.
CM
## Confusion Matrix and Statistics
##
##
## classifications IPA Ale
## IPA 101 39
## Ale 55 270
##
## Accuracy : 0.7978
## 95% CI : (0.7584, 0.8334)
## No Information Rate : 0.6645
## P-Value [Acc > NIR] : 1.498e-10
##
## Kappa : 0.5348
##
## Mcnemar's Test P-Value : 0.1218
##
## Sensitivity : 0.6474
## Specificity : 0.8738
## Pos Pred Value : 0.7214
## Neg Pred Value : 0.8308
## Prevalence : 0.3355
## Detection Rate : 0.2172
## Detection Prevalence : 0.3011
## Balanced Accuracy : 0.7606
##
## 'Positive' Class : IPA
##
I replaced the five missing values in the beer Style column with the real beer styles I found online. I randomly assigned 70% of the original data to training and 30% to testing with the seed set to 6. We can tell that when k=5, the accuracy is about 80%.
# Tune k from 1 to numks over `iterations` random 70/30 splits; masterAcc
# holds the accuracy for each (split, k) pair. The unused per-iteration
# `accs` data frame and the discarded bare table() call from the original
# loop have been removed.
for (j in seq_len(iterations)) {
  trainIndices <- sample(seq_len(nrow(df_beer_test)), round(splitPerc * nrow(df_beer_test)))
  beer_train <- df_beer_test[trainIndices, ]
  beer_test <- df_beer_test[-trainIndices, ]
  for (i in seq_len(numks)) {
    classifications <- knn(beer_train[, c('ABV', 'IBU')], beer_test[, c('ABV', 'IBU')],
                           beer_train$Style, prob = TRUE, k = i)
    CM <- confusionMatrix(table(classifications, beer_test$Style))
    masterAcc[j, i] <- CM$overall[1]
  }
}
MeanAcc <- colMeans(masterAcc)
# Mean accuracy as a function of k, rendered interactively via plotly.
p <- ggplot(mapping = aes(x = seq_len(numks), y = MeanAcc)) + geom_line()
ggplotly(p)
I shuffled the training and testing data 500 times with a 70%/30% split and tried integer k values from 1 to 30. From the plot we can tell that k=5 gives the highest mean accuracy, about 85%, which means we can predict whether a beer is an India Pale Ale or another type of Ale from its ABV and IBU values with roughly 85% accuracy when the number of nearest neighbors is set to 5.
#Naive Bayes--------------
# Naive Bayes (IPA vs Ale) averaged over 500 random 70/30 splits.
iterations <- 500
masterAcc <- matrix(nrow = iterations)
masterSen <- matrix(nrow = iterations)
masterSpec <- matrix(nrow = iterations)
splitPerc <- .7
for (j in seq_len(iterations)) {
  trainIndices <- sample(seq_len(nrow(df_beer_test)), round(splitPerc * nrow(df_beer_test)))
  beer_train <- df_beer_test[trainIndices, ]
  beer_test <- df_beer_test[-trainIndices, ]
  model <- naiveBayes(beer_train[, c('ABV', 'IBU')], as.factor(beer_train$Style), laplace = 1)
  # Predict once and reuse; the original called predict() twice per
  # iteration (once for a discarded table, once for the confusion matrix).
  preds <- predict(model, beer_test[, c('ABV', 'IBU')])
  CM <- confusionMatrix(table(preds, as.factor(beer_test$Style)))
  masterAcc[j] <- CM$overall[1]   # Accuracy
  masterSen[j] <- CM$byClass[1]   # Sensitivity (two-class: byClass is a named vector)
  masterSpec[j] <- CM$byClass[2]  # Specificity
}
MeanAcc <- colMeans(masterAcc)
MeanSen <- colMeans(masterSen)
MeanSpec <- colMeans(masterSpec)
MeanAcc
## [1] 0.7953247
MeanSen
## [1] 0.6749864
MeanSpec
## [1] 0.8655125
Now I chose a Naive Bayes model because I want to compare its accuracy with the KNN model. I shuffled the training and testing data 500 times with a 70%/30% split, as I did for KNN. The mean accuracy of Naive Bayes is about 79.5%, which is close to the single-split accuracy we got from KNN.
#Question 9 Sorted Lager, Stout, IPA
# Stouts and lagers (whole-word, case-insensitive match on Style) with both
# ABV and IBU present. The patterns are written literally instead of being
# assembled with str_c(..., sep = '').
df_beer_stout <- beer_imp %>%
  filter(!is.na(ABV), !is.na(IBU)) %>%
  filter(str_detect(Style, regex('\\bstout\\b', ignore_case = TRUE)))
df_beer_lager <- beer_imp %>%
  filter(!is.na(ABV), !is.na(IBU)) %>%
  filter(str_detect(Style, regex('\\blager\\b', ignore_case = TRUE)))
df_beer_stout$Style <- as.factor('Stout')
df_beer_lager$Style <- as.factor('Lager')
# Three-class data set: IPAs + stouts + lagers.
df_beer_sort <- rbind(df_beer_IPA, df_beer_stout)
df_beer_sort <- rbind(df_beer_sort, df_beer_lager)
#Naive Bayes--------------
# Naive Bayes on the three-class problem (IPA / Stout / Lager) averaged
# over 500 random 70/30 splits.
iterations <- 500
masterAcc <- matrix(nrow = iterations)
masterSen <- matrix(nrow = iterations)
masterSpec <- matrix(nrow = iterations)
splitPerc <- .7
for (j in seq_len(iterations)) {
  trainIndices <- sample(seq_len(nrow(df_beer_sort)), round(splitPerc * nrow(df_beer_sort)))
  beer_sort_train <- df_beer_sort[trainIndices, ]
  beer_sort_test <- df_beer_sort[-trainIndices, ]
  model <- naiveBayes(beer_sort_train[, c('ABV', 'IBU')], as.factor(beer_sort_train$Style), laplace = 1)
  preds <- predict(model, beer_sort_test[, c('ABV', 'IBU')])
  CM <- confusionMatrix(table(preds, as.factor(beer_sort_test$Style)))
  masterAcc[j] <- CM$overall[1]
  # BUG FIX: with three classes, CM$byClass is a classes-by-metrics matrix,
  # so the original byClass[1]/byClass[2] picked the *sensitivities* of the
  # first two classes via column-major indexing -- the reported "mean
  # specificity" of ~0.006 was actually the Stout sensitivity. Index by
  # metric name for the first class instead.
  masterSen[j] <- CM$byClass[1, 'Sensitivity']
  masterSpec[j] <- CM$byClass[1, 'Specificity']
}
MeanAcc <- colMeans(masterAcc)
MeanSen <- colMeans(masterSen)
MeanSpec <- colMeans(masterSpec)
MeanAcc
## [1] 0.7944096
# (the MeanSen/MeanSpec values below were produced by the original indexing)
MeanSen
## [1] 0.9040672
MeanSpec
## [1] 0.005961902
Now I want to use the IBU and ABV values to predict three beer styles: IPA, lager, and stout. I shuffled the training and testing data 500 times with a 70%/30% split, as before. First I used the Naive Bayes method; the mean accuracy is about 79.4%.
#-----------KNN
# Tune k from 1 to 30 for the three-class problem over 500 random 70/30
# splits. The unused per-iteration `accs` data frame and the discarded bare
# table() call from the original loop have been removed.
iterations <- 500
numks <- 30
splitPerc <- .7
masterAcc <- matrix(nrow = iterations, ncol = numks)
for (j in seq_len(iterations)) {
  trainIndices <- sample(seq_len(nrow(df_beer_sort)), round(splitPerc * nrow(df_beer_sort)))
  beer_sort_train <- df_beer_sort[trainIndices, ]
  beer_sort_test <- df_beer_sort[-trainIndices, ]
  for (i in seq_len(numks)) {
    classifications <- knn(beer_sort_train[, c('ABV', 'IBU')], beer_sort_test[, c('ABV', 'IBU')],
                           beer_sort_train$Style, prob = TRUE, k = i)
    CM <- confusionMatrix(table(classifications, beer_sort_test$Style))
    masterAcc[j, i] <- CM$overall[1]
  }
}
MeanAcc <- colMeans(masterAcc)
# Mean accuracy as a function of k, rendered interactively via plotly.
e <- ggplot(mapping = aes(x = seq_len(numks), y = MeanAcc)) + geom_line()
ggplotly(e)
Now I used the KNN method. The highest mean accuracy is 80.62%, at k=29. We can see from the plot that the accuracy dropped at k=2 but keeps increasing for k>=4.
# One concrete 70/30 split for the three-class problem, classified with k = 5.
trainIndices <- sample(seq_len(nrow(df_beer_sort)), round(splitPerc * nrow(df_beer_sort)))
beer_sort_train <- df_beer_sort[trainIndices, ]
beer_sort_test <- df_beer_sort[-trainIndices, ]
classifications_knn <- knn(beer_sort_train[, c('ABV', 'IBU')], beer_sort_test[, c('ABV', 'IBU')],
                           beer_sort_train$Style, prob = TRUE, k = 5)
# BUG FIX: the confusion matrix was previously built from `classifications`,
# the leftover k = 30 predictions from the last iteration of the tuning
# loop above, rather than from the k = 5 predictions computed here.
CM_knn <- confusionMatrix(table(classifications_knn, beer_sort_test$Style))
classifications_knn
## [1] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [13] IPA IPA IPA IPA IPA IPA IPA IPA IPA Lager IPA IPA
## [25] IPA IPA IPA Stout Stout IPA IPA IPA IPA IPA Stout IPA
## [37] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [49] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [61] IPA IPA IPA IPA Lager IPA IPA IPA IPA IPA IPA IPA
## [73] IPA IPA IPA IPA IPA IPA IPA Lager IPA IPA IPA IPA
## [85] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [97] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA Lager IPA
## [109] IPA Stout IPA Lager IPA IPA IPA IPA IPA IPA IPA IPA
## [121] IPA IPA IPA IPA IPA IPA IPA IPA Stout IPA IPA IPA
## [133] IPA IPA IPA IPA Lager IPA IPA IPA IPA IPA IPA IPA
## [145] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [157] IPA IPA IPA IPA IPA IPA Lager IPA Lager IPA Lager IPA
## [169] IPA IPA IPA Lager IPA IPA IPA IPA Lager IPA IPA Stout
## [181] IPA IPA Lager IPA IPA IPA Lager IPA IPA IPA IPA IPA
## [193] IPA IPA IPA Lager IPA IPA Lager Lager Lager Lager Lager Stout
## [205] Lager Lager Lager Lager IPA Lager Lager Lager IPA Lager Stout Lager
## [217] Lager Lager Lager Stout Lager Lager Lager Stout Stout Lager Lager Lager
## [229] Lager Lager IPA Stout Lager Lager Lager Lager Lager Lager Lager Lager
## [241] Lager Lager Stout Lager Lager Lager Lager Lager Lager
## attr(,"prob")
## [1] 0.8000000 1.0000000 1.0000000 1.0000000 0.8333333 0.8000000 0.8000000
## [8] 0.8000000 0.8000000 1.0000000 1.0000000 1.0000000 0.6000000 0.8000000
## [15] 0.8571429 0.8000000 1.0000000 0.8888889 1.0000000 1.0000000 0.4000000
## [22] 0.8000000 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000 0.4000000
## [29] 0.6000000 0.8000000 1.0000000 1.0000000 0.8000000 1.0000000 0.6000000
## [36] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.5000000 0.8000000
## [43] 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
## [50] 1.0000000 1.0000000 1.0000000 0.8000000 0.8571429 1.0000000 1.0000000
## [57] 1.0000000 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000
## [64] 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8000000
## [71] 0.6000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8333333
## [78] 1.0000000 1.0000000 1.0000000 0.8000000 0.8000000 1.0000000 1.0000000
## [85] 0.6666667 1.0000000 1.0000000 0.8333333 1.0000000 0.8000000 1.0000000
## [92] 0.8000000 1.0000000 1.0000000 0.8000000 0.8888889 1.0000000 1.0000000
## [99] 0.6000000 0.8000000 1.0000000 1.0000000 1.0000000 0.6000000 1.0000000
## [106] 1.0000000 1.0000000 1.0000000 1.0000000 0.6000000 1.0000000 0.8000000
## [113] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8000000 0.8000000
## [120] 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
## [127] 1.0000000 1.0000000 0.6000000 0.8333333 1.0000000 1.0000000 1.0000000
## [134] 1.0000000 1.0000000 1.0000000 0.6000000 0.8571429 1.0000000 1.0000000
## [141] 0.8000000 0.8000000 0.6000000 0.8000000 1.0000000 0.8000000 1.0000000
## [148] 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
## [155] 0.8000000 1.0000000 1.0000000 1.0000000 0.8000000 1.0000000 0.8000000
## [162] 0.5555556 1.0000000 1.0000000 0.6000000 1.0000000 0.8000000 1.0000000
## [169] 1.0000000 0.8000000 0.8888889 0.6666667 0.8000000 0.6000000 0.4000000
## [176] 0.8000000 0.4000000 0.8000000 1.0000000 0.6000000 0.6000000 0.8000000
## [183] 0.6000000 0.8000000 1.0000000 0.6000000 0.6000000 0.6000000 0.8888889
## [190] 1.0000000 1.0000000 0.6000000 0.8000000 0.8000000 0.8000000 0.8000000
## [197] 1.0000000 0.8888889 0.6000000 0.8000000 0.8000000 0.8333333 0.6000000
## [204] 0.6000000 0.6666667 0.6250000 0.6000000 0.6000000 0.8000000 1.0000000
## [211] 1.0000000 1.0000000 0.6000000 0.6666667 0.6000000 0.8000000 0.5000000
## [218] 0.6000000 1.0000000 0.6000000 1.0000000 1.0000000 0.4000000 0.6000000
## [225] 0.6000000 1.0000000 0.6666667 1.0000000 1.0000000 0.8333333 1.0000000
## [232] 0.6000000 1.0000000 0.8000000 0.6000000 0.6666667 1.0000000 1.0000000
## [239] 1.0000000 0.6000000 1.0000000 0.5000000 0.6000000 1.0000000 0.6000000
## [246] 0.8000000 0.8000000 0.8000000 0.6000000
## Levels: IPA Stout Lager
# Print the confusion matrix for the three-class KNN fit computed above.
CM_knn
## Confusion Matrix and Statistics
##
##
## classifications IPA Stout Lager
## IPA 153 26 6
## Stout 0 0 0
## Lager 9 12 43
##
## Overall Statistics
##
## Accuracy : 0.7871
## 95% CI : (0.731, 0.8363)
## No Information Rate : 0.6506
## P-Value [Acc > NIR] : 1.876e-06
##
## Kappa : 0.5433
##
## Mcnemar's Test P-Value : 2.110e-08
##
## Statistics by Class:
##
## Class: IPA Class: Stout Class: Lager
## Sensitivity 0.9444 0.0000 0.8776
## Specificity 0.6322 1.0000 0.8950
## Pos Pred Value 0.8270 NaN 0.6719
## Neg Pred Value 0.8594 0.8474 0.9676
## Prevalence 0.6506 0.1526 0.1968
## Detection Rate 0.6145 0.0000 0.1727
## Detection Prevalence 0.7430 0.0000 0.2570
## Balanced Accuracy 0.7883 0.5000 0.8863
Here is one of the cases where I shuffled the testing and training data once; the reported accuracy is about 78.7% with k=5.
#Back to Question 7
# Combine the three-class data with the non-IPA Ale subset and facet the
# ABV-vs-IBU scatter by style.
# NOTE(review): df_beer_sort already contains the IPA rows, and the Ale
# regex used earlier also matches "India Pale Ale", so some beers may
# appear under more than one facet -- confirm this is intended.
df_beer_study=rbind(df_beer_sort, df_beer_Ale)
df_beer_study %>% ggplot(aes(y=ABV, x=IBU))+geom_point(position='jitter')+geom_smooth() +facet_wrap(~Style)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Now we come back to Question 7 by examining four different styles of beer. We can see that the majority of lager-style beers have low IBU values and ABV values around 5%; no lager ABV values are above 7.5%. The ABV of India Pale Ales appears to increase as IBU increases, but their ABV values do not pass 10%, with the majority between 5% and 10%. The other types of Ale have low IBU values, most of them below 50, and most of their ABV values are between 3.75% and 10%. The sample of stout-style beers is small, but we can roughly tell that their ABV increases as IBU increases.